# How soccer became a global sport: where did it start and what changed as more teams were starting to compete.
# Which countries have dominated the different eras of soccer since everything started.


# Cleaning, processing and first exploration

# As seen below, this data set consists of (supposedly) all games since the inaugural Scotland - England in 1872.
# For each game, we have the score, the tournament, the host city and country.

### Loading libraries
library(ggplot2) # Data visualization
## Warning: package 'ggplot2' was built under R version 4.2.2
library(readr) # CSV file I/O, e.g. the read_csv function
## Warning: package 'readr' was built under R version 4.2.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.2.2
# Reading input file.
# Location of the results CSV. When the SOCCER_RESULTS_CSV environment variable
# is set it overrides the default path, so the script can run on another
# machine without editing the source.
results_path <- Sys.getenv(
  "SOCCER_RESULTS_CSV",
  unset = "C://Users//Nishtha//Documents//bhavuk//Semester 6//DV//J Comp//results.csv"
)
df <- read_csv(results_path)
## Rows: 44353 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): home_team, away_team, tournament, city, country
## dbl  (2): home_score, away_score
## lgl  (1): neutral
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the raw data.
head(df, n = 6L)
## # A tibble: 6 × 9
##   date       home_team away_team home_sc…¹ away_…² tourn…³ city  country neutral
##   <date>     <chr>     <chr>         <dbl>   <dbl> <chr>   <chr> <chr>   <lgl>  
## 1 1872-11-30 Scotland  England           0       0 Friend… Glas… Scotla… FALSE  
## 2 1873-03-08 England   Scotland          4       2 Friend… Lond… England FALSE  
## 3 1874-03-07 Scotland  England           2       1 Friend… Glas… Scotla… FALSE  
## 4 1875-03-06 England   Scotland          2       2 Friend… Lond… England FALSE  
## 5 1876-03-04 Scotland  England           3       0 Friend… Glas… Scotla… FALSE  
## 6 1876-03-25 Scotland  Wales             4       0 Friend… Glas… Scotla… FALSE  
## # … with abbreviated variable names ¹​home_score, ²​away_score, ³​tournament
# Let's check whether we have any NA or NULL values that we should clean.
# Apparently not. Good news, let's continue.
# Count missing values per column. vapply() walks the columns directly, so the
# tibble is not coerced to a matrix first (as apply() would do), and the result
# is guaranteed to be one integer per column.
vapply(df, function(column) sum(is.na(column)), integer(1))
##       date  home_team  away_team home_score away_score tournament       city 
##          0          0          0          0          0          0          0 
##    country    neutral 
##          0          0
# Let's process a bit the data so that we can have a quicker access to some important feature such as the result or the names of the winning or losing team. The outcome of a game will be encoded as D for draw, H for the home team winning and A for the away team winning. We will also extract some date-related features such as the day of week or month.
# Encode the result of one or more games from the home team's point of view.
#
# Args:
#   home_score: numeric vector of goals scored by the home team.
#   away_score: numeric vector of goals scored by the away team.
#
# Returns:
#   A character vector: "H" for a home win, "A" for an away win, "D" for a
#   draw, and NA where either score is missing.
game_outcome <- function(home_score, away_score) {
  # sign() maps the goal difference to -1 / 0 / 1; adding 2 turns that into an
  # index into the lookup vector. This works on whole columns and propagates
  # NA instead of failing inside a scalar `if`.
  outcome_codes <- c("A", "D", "H")
  outcome_codes[sign(home_score - away_score) + 2]
}

# Name of the team that won each game.
#
# Args:
#   home_score, away_score: numeric vectors of goals scored by each side.
#   home_team, away_team: character vectors with the corresponding team names.
#
# Returns:
#   The winning team's name for each game, or NA for a draw or when a score
#   is missing.
winning_team <- function(home_score, away_score, home_team, away_team) {
  goal_diff <- home_score - away_score
  # Nested ifelse() keeps the function vectorised; draws fall through to a
  # character NA so the result type does not depend on the outcome.
  ifelse(goal_diff > 0, home_team,
         ifelse(goal_diff < 0, away_team, NA_character_))
}

# Name of the team that lost each game.
#
# Args:
#   home_score, away_score: numeric vectors of goals scored by each side.
#   home_team, away_team: character vectors with the corresponding team names.
#
# Returns:
#   The losing team's name for each game, or NA for a draw or when a score
#   is missing.
losing_team <- function(home_score, away_score, home_team, away_team) {
  goal_diff <- home_score - away_score
  # Mirror image of the winner lookup: the home side loses when the goal
  # difference is negative, the away side when it is positive.
  ifelse(goal_diff < 0, home_team,
         ifelse(goal_diff > 0, away_team, NA_character_))
}

# Add calendar features (year, month, day of week) and result features
# (outcome, winning team, losing team) to every game.
#
# Month and weekday names are derived from the numeric fields of as.POSIXlt()
# rather than format("%b") / weekdays(), whose output follows the session
# locale. The later day-of-week and month plots build factors from English
# level names, so the names have to be English regardless of where the script
# runs. `year` is kept as the character string produced by format().
df <- df %>%
  mutate(year = format(date, "%Y"),
         month = month.abb[as.POSIXlt(date)$mon + 1L],
         dayofweek = c("Sunday", "Monday", "Tuesday", "Wednesday",
                       "Thursday", "Friday", "Saturday")[as.POSIXlt(date)$wday + 1L]) %>%
  # rowwise() evaluates the helpers one game at a time, which is safe whether
  # or not they accept whole vectors.
  rowwise() %>%
  mutate(outcome = game_outcome(home_score, away_score),
         winning_team = winning_team(home_score, away_score, home_team, away_team),
         losing_team = losing_team(home_score, away_score, home_team, away_team)) %>%
  ungroup()


head(df)
## # A tibble: 6 × 15
##   date       home_…¹ away_…² home_…³ away_…⁴ tourn…⁵ city  country neutral year 
##   <date>     <chr>   <chr>     <dbl>   <dbl> <chr>   <chr> <chr>   <lgl>   <chr>
## 1 1872-11-30 Scotla… England       0       0 Friend… Glas… Scotla… FALSE   1872 
## 2 1873-03-08 England Scotla…       4       2 Friend… Lond… England FALSE   1873 
## 3 1874-03-07 Scotla… England       2       1 Friend… Glas… Scotla… FALSE   1874 
## 4 1875-03-06 England Scotla…       2       2 Friend… Lond… England FALSE   1875 
## 5 1876-03-04 Scotla… England       3       0 Friend… Glas… Scotla… FALSE   1876 
## 6 1876-03-25 Scotla… Wales         4       0 Friend… Glas… Scotla… FALSE   1876 
## # … with 5 more variables: month <chr>, dayofweek <chr>, outcome <chr>,
## #   winning_team <chr>, losing_team <chr>, and abbreviated variable names
## #   ¹​home_team, ²​away_team, ³​home_score, ⁴​away_score, ⁵​tournament
# Now, let's do some basic exploration. How many entries? Answer: more than 44k matches.
# Number of rows (games) and columns (features) in the enriched data set.
c(nrow(df), ncol(df))
## [1] 44353    15
# A journey through the historical landscape of international soccer
# Which teams play the most?
# Let's start by checking which are the most represented teams? This will tell us which are the team with the richest history.
# Surprisingly, Sweden is the team that has played the most games. Most of the top 10 countries are major soccer nations such as Brazil, Argentina, England, Germany or France. Countries such as Uruguay, Mexico and Hungary are also old teams, as they participated in the first World Cups (1930 and/or 1934).

# Stack the home and away sides so that every game contributes one row for
# each of its two participants, together with the year it was played.
team_column <- c(df$home_team, df$away_team)
year_column <- as.numeric(c(df$year, df$year))
all_teams <- data.frame(teams = team_column, year = year_column)

# Number of games per team, most active teams first.
all_teams_count <- arrange(
  summarise(group_by(all_teams, teams), number_games = n()),
  desc(number_games)
)

head(all_teams_count, n = 10)
## # A tibble: 10 × 2
##    teams       number_games
##    <chr>              <int>
##  1 Sweden              1053
##  2 England             1049
##  3 Brazil              1021
##  4 Argentina           1018
##  5 Germany              986
##  6 Hungary              966
##  7 Mexico               935
##  8 Uruguay              919
##  9 South Korea          905
## 10 France               880
# It is likely all these teams have a different trajectory, some might have start playing earlier and some later. The plot below displays the cumulative sum of the number of matches for these top 10 teams. Hover the line to display the name of the team. You can also click on a team's name to hide/show it.

# Yearly game counts, for years before 2018, restricted to the ten teams with
# the most games overall (all_teams_count is sorted in descending order).
top_teams_games_per_year <- all_teams %>%
  filter(year < 2018, teams %in% head(all_teams_count, 10)$teams) %>%
  group_by(teams, year) %>%
  summarise(nb_games = n()) %>%
  # 1 January of each year, so the x axis can be drawn as a date.
  mutate(year_date = as.Date(paste0(year, "-01-01")))
## `summarise()` has grouped output by 'teams'. You can override using the
## `.groups` argument.
library(plotly)

# Running total of games per team, accumulated in chronological order.
top_teams_games_per_year <- mutate(
  group_by(arrange(top_teams_games_per_year, teams, year), teams),
  cumsum = cumsum(nb_games)
)

# One line per team; ggplotly() makes the chart interactive (hover and legend
# toggling).
p <- ggplot(
  top_teams_games_per_year,
  aes(x = year_date, y = cumsum, colour = teams, group = teams)
) +
  labs(
    title = "Top 10 teams in total number of games",
    x = "Year",
    y = "Cumulated number of games",
    colour = "Click on a team \nto hide/show it"
  ) +
  geom_line()
ggplotly(p)
# The 10 most active teams indeed have different trajectories. England gets its second position thanks to the many games they played in the 19th century. Some countries such as Sweden, France or Hungary show a steadier progression, while teams like Korea or Mexico join the top 10 thanks to their recent hyperactivity (Korea's first official games were just before 1950).
# How many games per year?
# Let's now check how many games were played each year and how the total number of international games evolve with time.
# Number of games per calendar year, for years before 2018.
# `year` is stored as a character string, so it is converted to a number
# before the cut-off is applied; filtering first would compare strings.
tmp <- df %>%
  mutate(year = as.numeric(year)) %>%
  filter(year < 2018) %>%
  count(year, name = "nb_games")

ggplot(tmp, aes(x = year, y = nb_games, group = 1)) +
  geom_line() +
  labs(x = "Year", title = "Number of international soccer games", y = "") +
  scale_x_continuous(breaks = seq(1870, 2020, 10))

# There are few interestings things going on here:
# * Number of games is rising, with high growth in the 80s/90s.
# * It seems there is a peak around 2010, with a slight decrease since.
# * We see a drop during world wars.
# * Since the 80s, data is very spiky, likely due to the absence/presence of world cups or other events.
#
# Let's try to visualise this to add some understanding to our plot.

# World Cup years up to 2014: every four years, except that the sequence
# skips from 1938 to 1950.
wc_years <- c(1930, 1934, 1938, seq(1950, 2014, 4))

# Flag the years in which a World Cup took place.
tmp <- tmp %>%
  mutate(is_wc = year %in% wc_years)

# Same yearly curve as before, with World Cup years highlighted as points and
# thin vertical guides at 1914, 1918, 1939 and 1945.
# `linewidth` replaces `lwd`, which ggplot2 >= 3.4.0 reports as deprecated
# for lines.
ggplot(tmp, aes(x = year, y = nb_games, group = 1)) +
  geom_line() +
  geom_point(data = tmp %>% filter(is_wc), aes(colour = is_wc)) +
  labs(x = "Year", title = "Number of international soccer games", y = "", colour = "World cup year") +
  geom_vline(xintercept = c(1914, 1918, 1939, 1945), linewidth = 0.3, colour = "gray80") +
  scale_x_continuous(breaks = seq(1870, 2020, 10))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

# The two main drops indeed correspond to the 2 world wars but, surprisingly, the world cup years are those counting less matches.
# Let's investigate which are the most common game types and competitions every year, since 2000.

# Number of games per tournament and year. The result stays grouped by
# tournament because summarise() only drops the last grouping level.
df_competitions <- summarise(
  group_by(df, tournament, year),
  nb_games = n()
)
## `summarise()` has grouped output by 'tournament'. You can override using the
## `.groups` argument.
# Stacked bars of games per year (2000-2017), one colour per tournament.
# `year` is a character column: it is converted for the range test only, so
# the x axis stays discrete. The legend is hidden with "none", the
# replacement for the deprecated `FALSE`.
ggplot(df_competitions %>%
         filter(as.numeric(year) >= 2000, as.numeric(year) < 2018),
       aes(x = year, y = nb_games, fill = tournament)) +
  geom_col() +
  guides(fill = "none") +
  labs(x = "Year", y = "Number of games")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.

# We can see that some events/tournaments are more frequent on non-world cup years such as 2007 or 2011. Let's check what they are.
# Tournaments played in 2011, largest first.
arrange(filter(df_competitions, year == 2011), desc(nb_games))
## # A tibble: 22 × 3
## # Groups:   tournament [22]
##    tournament                           year  nb_games
##    <chr>                                <chr>    <int>
##  1 Friendly                             2011       379
##  2 FIFA World Cup qualification         2011       216
##  3 UEFA Euro qualification              2011       154
##  4 African Cup of Nations qualification 2011        77
##  5 AFC Asian Cup                        2011        32
##  6 AFC Challenge Cup qualification      2011        29
##  7 Island Games                         2011        29
##  8 Pacific Games                        2011        29
##  9 CECAFA Cup                           2011        26
## 10 Copa América                         2011        26
## # … with 12 more rows
# Tournaments played in 2010, largest first.
arrange(filter(df_competitions, year == 2010), desc(nb_games))
## # A tibble: 21 × 3
## # Groups:   tournament [21]
##    tournament                           year  nb_games
##    <chr>                                <chr>    <int>
##  1 Friendly                             2010       423
##  2 UEFA Euro qualification              2010        94
##  3 FIFA World Cup                       2010        64
##  4 African Cup of Nations qualification 2010        48
##  5 CFU Caribbean Cup qualification      2010        34
##  6 African Cup of Nations               2010        29
##  7 AFF Championship                     2010        24
##  8 AFC Asian Cup qualification          2010        19
##  9 CECAFA Cup                           2010        18
## 10 CFU Caribbean Cup                    2010        16
## # … with 11 more rows
# World Cup qualifications generate many more matches than the World Cup itself, which makes sense as the World Cup only concerns 32 countries. This is well shown in the two plots below: there are no WC qualification matches during a World Cup year, and the number of qualification matches is greater than the number of WC matches by a factor of 3 to 7 in general.

# Keep five selected competitions for the years 2006-2017.
# `year` is a character column, so it is converted to a number for the range
# test instead of being compared as a string; the column itself is left
# unchanged so the plots keep a discrete x axis.
df_competition_filtered <- df_competitions %>%
  filter(as.numeric(year) >= 2006,
         as.numeric(year) < 2018,
         tournament %in% c("Friendly","UEFA Euro qualification","FIFA World Cup", "FIFA World Cup qualification", "African Cup of Nations qualification"))

# Line view: one line per competition.
ggplot(df_competition_filtered, aes(x = year, y = nb_games, group = tournament, colour = tournament)) +
  geom_point() +
  geom_line() +
  labs(x = "Year", y = "Nb games", colour = "Competition")

# %% [code]
# Stacked-bar view of the same data.
ggplot(df_competition_filtered, aes(x = year, y = nb_games, group = tournament, fill = tournament)) +
  geom_bar(stat = "identity") +
  labs(x = "Year", y = "Nb games", fill = "Competition")

# Worldwide soccer adoption
# When did soccer start to be widely played, i.e. when do most nations start playing international games? The plot below teaches us several things:
# 
# * The number of teams increased steadily from 1902, and this increase accelerated up to 1920.
# * From there, the pace of addition of new teams increased much faster and stalled a bit around the late 40's.
# * Then we see a steady and rapid growth up to the mid 1990's.

# Year of the first recorded game for every team.
df_teams_start <- summarise(
  group_by(mutate(all_teams, year = as.numeric(year)), teams),
  first_game = min(year)
)

# Number of teams debuting each year, and the running total over time.
df_year_teams_start <- df_teams_start %>%
  count(first_game, name = "n") %>%
  arrange(first_game) %>%
  mutate(cumsum = cumsum(n))

ggplot(df_year_teams_start, aes(x = first_game, y = cumsum)) +
  labs(x = "Year", title = "Cumulative sum of number of international soccer teams", y = "") +
  scale_x_continuous(breaks = seq(1870, 2020, 10)) +
  geom_line()

# Which were the first and last teams to join?

# The first four teams to compete in international games were from what is now the UK. Soccer then crossed the pond, and teams such as Canada, USA, Argentina or Uruguay joined the party. At the same time, central European countries such as Austria and Hungary also joined the international arena.

# Amongst the late joiners we mostly find tiny countries (Vatican or Comoros) and recent ones (Kosovo or South Sudan). We also find Caribbean or North American islands which aren't countries but collectivities or municipalities of countries such as France or the Netherlands. Although they are not nations, they competed against other countries either in friendly games or in local tournaments.
# The ten teams with the earliest first game.
head(arrange(df_teams_start, first_game), 10)
## # A tibble: 10 × 2
##    teams            first_game
##    <chr>                 <dbl>
##  1 England                1872
##  2 Scotland               1872
##  3 Wales                  1876
##  4 Northern Ireland       1882
##  5 Canada                 1885
##  6 United States          1885
##  7 Argentina              1902
##  8 Austria                1902
##  9 Hungary                1902
## 10 Uruguay                1902
# The ten teams with the most recent first game.
tail(arrange(df_teams_start, first_game), 10)
## # A tibble: 10 × 2
##    teams             first_game
##    <chr>                  <dbl>
##  1 Surrey                  2018
##  2 Yorkshire               2018
##  3 Chameria                2019
##  4 Saint Helena            2019
##  5 Aymara                  2022
##  6 Biafra                  2022
##  7 Brunei Darussalam       2022
##  8 Mapuche                 2022
##  9 Maule Sur               2022
## 10 Yoruba Nation           2022
# We have seen how different teams and continent started to compete one after the others. Let's now see what did this imply for the game itself and its organisation.
# When do games occur?
# Interestingly, the very first games mostly occurred on Saturdays, but a decent number also took place on Mondays! No game occurred on a Sunday until 1900, potentially for religious reasons, but around the 1910's Sunday was the most common day of the week to see an international game. Other weekdays, from Tuesday to Friday, weren't an option until later (as late as 1910 for Fridays).
# 
# The proportion of games happening on a given day then changed quite a lot. Wednesday games became very common, and around 30% of the games happened on this day around the year 2000. More recently, days such as Tuesday, Thursday or Friday also became more popular.
# Share of each year's games played on each day of the week (years before
# 2018). The weekday becomes a factor ordered Monday to Sunday so the plot
# facets appear in calendar order.
df_games_per_dayofweek <- df %>%
  mutate(year = as.numeric(year)) %>%
  filter(year < 2018) %>%
  group_by(year, dayofweek) %>%
  summarise(n = n()) %>%
  group_by(year) %>%
  mutate(
    perc = n / sum(n) * 100,
    dayofweek = factor(
      dayofweek,
      levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
    )
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# One panel per weekday showing its yearly share of games.
# The colour legend is redundant with the facet titles, so it is hidden;
# "none" replaces the deprecated `FALSE` argument of guides().
ggplot(df_games_per_dayofweek, aes(x = year, y = perc, colour = dayofweek, group = dayofweek)) +
  geom_line() +
  facet_wrap(~dayofweek) +
  labs(x = "Year", y = "Percentage of games played") +
  guides(colour = "none") +
  scale_x_continuous(breaks = seq(1870, 2020, 20)) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Now that we have looked at days, let's check whether some months are more popular for soccer games. The first games mostly occurred during spring months and, since then, some months have seen peaks of popularity for international games in different periods (e.g. many games happened in December in the 1940s).
# In a more recent history, international games became less common in May but more in June.
# Share of each year's games played in each month (years before 2018).
# The month becomes a factor ordered January to December so the plot facets
# appear in calendar order.
df_games_per_month <- df %>%
  mutate(year = as.numeric(year)) %>%
  filter(year < 2018) %>%
  group_by(year, month) %>%
  summarise(n = n()) %>%
  group_by(year) %>%
  mutate(
    perc = n / sum(n) * 100,
    month = factor(
      month,
      levels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
    )
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# One panel per month showing its yearly share of games.
# The colour legend is redundant with the facet titles, so it is hidden;
# "none" replaces the deprecated `FALSE` argument of guides().
ggplot(df_games_per_month, aes(x = year, y = perc, colour = month, group = month)) +
  geom_line() +
  facet_wrap(~month) +
  labs(x = "Year", y = "Percentage of games played") +
  guides(colour = "none") +
  scale_x_continuous(breaks = seq(1870, 2020, 20)) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Evolution of results
 
# Let's now talk about sport and actual results! First, let's check how the proportion of draws and home/away victories evolved through time. The main learnings are:
# * A victory of the home-based team has always been the most likely event.
# * A victory of the visitors is the second most likely outcome, although it tends to decrease in the second half of the 20th century.
# * A draw has always been the least likely outcome, although it has increased in share since the 1940's.
 
# It is to be noted that the "home" team isn't always playing on his own country, as for example during world or continental cups.

# Per year: number of games for each outcome (H / A / D), the yearly total,
# and the outcome's share of that total in percent.
df_outcome_per_year <- df %>%
  mutate(year = as.numeric(year)) %>%
  group_by(year, outcome) %>%
  summarise(n = n()) %>%
  group_by(year) %>%
  mutate(total_year = sum(n)) %>%
  mutate(perc = n / total_year * 100)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Yearly outcome shares for 1901-2017, with a loess trend line per outcome
# drawn over the raw yearly lines.
ggplot(
  filter(df_outcome_per_year, year > 1900 & year < 2018),
  aes(x = year, y = perc, group = outcome, colour = outcome)
) +
  geom_line() +
  geom_smooth(method = "loess", se = FALSE) +
  scale_x_continuous(breaks = seq(1870, 2020, 20)) +
  labs(x = "Year", y = "Percentage of games", colour = "Outcome")
## `geom_smooth()` using formula = 'y ~ x'

# Let's now get to what is at the heart of soccer: goals! How did this evolve with time?

# Although it started low (the first game resulted in a 0-0 between Scotland and England), the number of goals per game quickly skyrocketed and, before 1900, the average number of goals per game per year could be as high as 8!
# This average then stabilized around 4 until 1950 and then decreased to 2.5 in the modern era. The 80's were the period when games delivered the lowest number of goals.
# Per year: number of games, total goals scored by both sides, and the
# resulting average number of goals per game.
df_goals_per_game <- df %>%
  mutate(year = as.numeric(year),
         total_goals = home_score + away_score) %>%
  group_by(year) %>%
  summarise(
    nb_games = n(),
    nb_goals = sum(total_goals),
    goals_per_game = nb_goals / nb_games
  )

ggplot(df_goals_per_game, aes(x = year, y = goals_per_game)) +
  scale_x_continuous(breaks = seq(1870, 2020, 10)) +
  labs(x = "Year", y = "", title = "Average number of goals per game") +
  geom_line()